To predict the concrete strength using the data available in file "concrete.csv". Apply feature engineering and model tuning to obtain 85% to 95% accuracy.
The data for this project is available in file https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/. The same has been shared along with the course content.
Exploratory Data Quality Report Reflecting the Following:
Algorithms that you think will be suitable for this project. Use Kfold Cross-Validation to evaluate model performance. Use appropriate metrics and make a DataFrame to compare models w.r.t their metrics. (at least 3 algorithms, one bagging and one boosting based algorithms have to be there). (15 marks) Techniques employed to squeeze that extra performance out of the model without making it overfit. Use Grid Search or Random Search on any of the two models used above. Make a DataFrame to compare models after hyperparameter tuning and their metrics as above. (15 marks)
Given are the variable name, variable type, the measurement unit, and a brief description. Concrete compressive strength is the regression target. The order of this listing corresponds to the order of the columns in the database.
Name Data Type Measurement Description
#Load Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the concrete compressive-strength dataset from the local CSV file.
concrete_df = pd.read_csv("concrete.csv")
# Peek at the first and last rows to sanity-check the load.
concrete_df.head(10)
concrete_df.tail(10)
# Check to see the number of records in the dataset
concrete_df.shape
# Column dtypes, non-null counts and memory usage.
concrete_df.info()
concrete_df.dtypes
# Check to see if data has any missing values
concrete_df.isnull().any()
#Analyze the distribution of the dataset
concrete_df.describe(include = 'all')
concrete_df.describe(include = 'all').transpose()
print(concrete_df.columns)
# All numeric columns: the eight predictors plus the target 'strength'.
num_cols = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age','strength']
from matplotlib.cbook import boxplot_stats
# Central tendency distribution: print the boxplot statistics (median,
# quartiles, whiskers, fliers) and draw a boxplot for each attribute.
# Fixed: loop body properly indented; the unused counter `i` was removed.
for col in num_cols:
    print('Attribute: ',col)
    print(boxplot_stats(concrete_df[col]))
    plt.figure(figsize=(10,8))
    sns.boxplot(concrete_df[col])
    plt.show();
    print('-------------------------------------------------------------------------')
# Skewness distribution: histogram with a KDE overlay for each attribute.
# Fixed: loop body properly indented; unused counter `i` removed; and
# sns.distplot (deprecated since seaborn 0.11 and later removed) replaced
# with the supported histplot(kde=True) equivalent.
for col in num_cols:
    plt.figure(figsize=(10,8))
    sns.histplot(concrete_df[col], kde=True, stat='density')
# Check the unique values in each column of the dataframe
concrete_df.nunique()
# Automated EDA report (pandas_profiling is a third-party package).
import pandas_profiling
concrete_df.profile_report()
from pandas_profiling import ProfileReport
profile = ProfileReport(concrete_df, title="Pandas Profiling Report")
profile.to_widgets()
# Cardinality and value frequencies of the target column.
concrete_df.strength.nunique()
concrete_df.strength.value_counts()
# Plot the distribution of the target attribute
plt.figure(figsize=(8,6))
sns.distplot(concrete_df['strength']);
# Dashed vertical line marking the mean strength.
plt.axvline(concrete_df['strength'].mean(), linestyle="dashed", color="g", label='mean', linewidth=2)
plt.legend(loc="best")
plt.show()
# Scatter plot with a fitted regression line of each attribute vs strength.
# Fixed: loop body properly indented; y passed by column name rather than
# as a Series alongside `data` (equivalent, but the idiomatic form).
for col in num_cols:
    plt.figure(figsize=(12,12))
    ax = sns.regplot(x=col, y='strength', data=concrete_df)
# Pairwise scatter plots with KDE curves on the diagonal.
sns.pairplot(data = concrete_df,diag_kind ='kde');
# Correlation matrix
corr = concrete_df.corr()
corr
# Heatmap
sns.set(font_scale=1.15)
fig,ax=plt.subplots(figsize=(16,16))
sns.heatmap(corr, cmap='GnBu',annot=True,linewidths=0.01,center=0,linecolor='white',square=True)
plt.title('Correlation between attributes',fontsize=16)
ax.tick_params(labelsize=14)
# Identify duplicates records in the data
dupes = concrete_df.duplicated()
sum(dupes)
# Show the duplicated rows themselves.
duplicate = concrete_df[concrete_df.duplicated()]
print("Duplicate Rows :")
duplicate
# Removing Duplicates
concrete_df = concrete_df.drop_duplicates()
# Verify that no duplicates remain.
dupes = concrete_df.duplicated()
sum(dupes)
# Check to see the number of records in the dataset
concrete_df.shape
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers.
Q1 = concrete_df.quantile(0.25)
Q3 = concrete_df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# (row, column) positions of all outlying values across the frame.
np.where((concrete_df < (Q1 - 1.5 * IQR)) | (concrete_df > (Q3 + 1.5 * IQR)))
# Check for outliers in dataset using boxplot
fig,ax = plt.subplots(figsize=(35,20))
sns.boxplot(data=concrete_df,width=0.5,palette='GnBu',fliersize=7);
# Work on a copy so the original (deduplicated) frame is preserved.
concrete_df_new = concrete_df.copy()
# Count IQR-based outliers (Tukey fences at 1.5*IQR) per attribute.
# Fixed: loop body properly indented; the unused counter `i` was removed.
for col in num_cols:
    print('Attribute: ',col)
    Q1 = concrete_df[col].quantile(q=0.25)
    Q3 = concrete_df[col].quantile(q=0.75)
    lower_limit = Q1-1.5*(Q3-Q1)
    upper_limit = Q3+1.5*(Q3-Q1)
    print('Lower Limit: ', lower_limit)
    print('Upper Limit: ', upper_limit)
    outliers = len(concrete_df.loc[(concrete_df[col] < (lower_limit)) | (concrete_df[col] > (upper_limit))])
    print("Total number of Outliers:",outliers)
    print('-------------------------------------------------------------------------')
# Columns whose outliers will be replaced (winsorized at the Tukey fences).
out_cols = ['slag','water', 'fineagg']
#Replacing the outliers by Min/Max Replacement:
# BUG FIX: the original only capped values above max_val and never applied
# min_val (it was computed but unused), so low-side outliers were left
# untouched. Series.clip caps both sides and also avoids the
# chained-assignment `.loc` write on a column selection.
for col in out_cols:
    q1 = concrete_df_new[col].quantile(0.25)
    q3 = concrete_df_new[col].quantile(0.75)
    iqr = q3 - q1
    min_val = q1-1.5*iqr
    max_val = q3+1.5*iqr
    concrete_df_new[col] = concrete_df_new[col].clip(lower=min_val, upper=max_val)
# Winsorizing changes values, not row counts — the shapes should match.
concrete_df_new.shape, concrete_df.shape
# Check for outliers in dataset using boxplot
fig,ax = plt.subplots(figsize=(35,20))
sns.boxplot(data=concrete_df_new,width=0.5,palette='BuPu',fliersize=7);
sns.pairplot(data = concrete_df_new,diag_kind ='kde');
# Correlation matrix
corr = concrete_df_new.corr()
corr
# Heatmap
sns.set(font_scale=1.15)
fig,ax=plt.subplots(figsize=(16,16))
sns.heatmap(corr, cmap='BuPu',annot=True,linewidths=0.01,center=0,linecolor='white',square=True)
plt.title('Correlation between attributes',fontsize=16)
ax.tick_params(labelsize=14)
# Absolute correlation of every attribute with the target.
target_corr = abs(corr['strength'])
#Selecting highly correlated features
rel_features = target_corr[target_corr>0.30]
rel_features
# Cross-check pairwise collinearity between selected feature pairs.
print(concrete_df_new[["cement","superplastic"]].corr())
print(concrete_df_new[["age","cement"]].corr())
#Scaling the dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn import metrics
import numpy as np
std_scale = StandardScaler()
# NOTE(review): the target column 'strength' is standardized along with the
# features, so all downstream RMSE values are in standardized (z-score)
# units, not MPa — confirm this is intended.
concrete_df_scaled = pd.DataFrame(std_scale.fit_transform(concrete_df_new),columns = concrete_df_new.columns)
# Dropping the least significant features
X = concrete_df_scaled.drop(['slag','ash','strength'], axis=1)
#X = concrete_df_scaled.drop(['slag','ash','strength','fineagg','coarseagg'], axis=1)
Y = concrete_df_scaled[['strength']]
# Split into training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,random_state=1)
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
# Baseline model: ordinary least squares, evaluated with 10-fold CV on MSE.
linreg = LinearRegression()
linreg_scores = cross_val_score(linreg, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True),scoring='neg_mean_squared_error')
linreg.fit(X_train, y_train)
print(linreg.coef_)
print()
# score() of a regressor is R^2 on the given data, not classification accuracy.
print('Training score :', linreg.score(X_train, y_train))
print('Testing score :', linreg.score(X_test, y_test))
# Average cross-validated RMSE (scores are negative MSE, hence the sign flip).
print('RMSE:',np.sqrt(-(linreg_scores)).mean())
#Store the results for each model in a dataframe for final comparison
# BUG FIX: the method label said 'Logistic Regression', but this model is
# ordinary linear regression.
results_df = pd.DataFrame({'Method':['Linear Regression'],'Training Score': linreg.score(X_train, y_train).round(3),'Testing score': linreg.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(linreg_scores)).mean().round(3)})
results_df
from sklearn.linear_model import Ridge
# Ridge regression (L2 penalty); a small alpha keeps it close to plain OLS.
ridge = Ridge(alpha = 0.01,random_state=1)
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
ridge.fit(X_train,y_train)
print ("Ridge model:", (ridge.coef_))
print()
# R^2 on train and test sets.
print('Training Score:',ridge.score(X_train, y_train))
print('Testing Score:',ridge.score(X_test, y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(ridge_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Ridge Regression'],'Training Score': ridge.score(X_train, y_train).round(3),'Testing score': ridge.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(ridge_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.linear_model import Lasso
# Lasso regression (L1 penalty) — can shrink uninformative coefficients to zero.
lasso = Lasso(alpha = 0.01,random_state=1)
lasso_scores = cross_val_score(lasso, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
lasso.fit(X_train,y_train)
print ("Lasso model:", (lasso.coef_))
print()
# R^2 on train and test sets.
print('Training Score:',lasso.score(X_train, y_train))
print('Testing Score:',lasso.score(X_test, y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(lasso_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Lasso Regression'],'Training Score': lasso.score(X_train, y_train).round(3),'Testing score': lasso.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(lasso_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.svm import SVR
# Support Vector Regression with default RBF kernel.
svr = SVR()
svr_scores = cross_val_score(svr, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
svr.fit(X_train, y_train)
# R^2 on train and test sets.
print('Training Score:',svr.score(X_train, y_train))
print('Testing Score:',svr.score(X_test, y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(svr_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['SVM Regression'],'Training Score': svr.score(X_train, y_train).round(3),'Testing score': svr.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(svr_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.tree import DecisionTreeRegressor
# Unpruned decision tree — expected to overfit (serves as a reference point).
dtree = DecisionTreeRegressor(random_state=1)
dtree_scores = cross_val_score(dtree, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
dtree.fit(X_train, y_train)
#Making the prediction
y_predict = dtree.predict(X_test)
# Feature importances learned by the tree.
print (pd.DataFrame(dtree.feature_importances_, columns = ["Imp"], index = X_train.columns))
# R^2 on train and test sets.
print('Training Score:',dtree.score(X_train,y_train))
print('Testing Score:',dtree.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(dtree_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Decision Tree'],'Training Score': dtree.score(X_train, y_train).round(3),'Testing score': dtree.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(dtree_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
Although the testing score has improved, there is a large drop from the training score to the testing score, indicating that this model is highly overfit and needs to be pruned.
# Pruned tree: capping depth at 6 to reduce the train/test gap (overfitting).
dtree_pruned = DecisionTreeRegressor(max_depth=6,random_state=1)
dtree_pruned_scores = cross_val_score(dtree_pruned, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
dtree_pruned.fit(X_train, y_train)
#Making the prediction
y_predict = dtree_pruned.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',dtree_pruned.score(X_train,y_train))
print('Testing Score:',dtree_pruned.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(dtree_pruned_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Decision Tree with Pruning'],'Training Score': dtree_pruned.score(X_train, y_train).round(3),'Testing score': dtree_pruned.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(dtree_pruned_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.ensemble import RandomForestRegressor
# Default random forest (bagging of decision trees).
rforest = RandomForestRegressor(random_state=1)
rforest_scores = cross_val_score(rforest, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
rforest = rforest.fit(X_train, y_train)
y_predict = rforest.predict(X_test)
# Calculate the score of Random Forest Regressor (R^2 on train and test)
print('Training Score:',rforest.score(X_train,y_train))
print('Testing Score:',rforest.score(X_test,y_test))
# calculate the average RMSE
print('RMSE:',np.sqrt(-(rforest_scores)).mean())
The Random Forest regressor behaves similarly: although the testing score has improved, the large drop from the training score to the testing score indicates that the model is overfit and needs to be regularized (pruned).
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Random Forest'],'Training Score': rforest.score(X_train, y_train).round(3),'Testing score': rforest.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(rforest_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
# Regularized forest: fewer/shallower trees and a minimum leaf size to
# reduce overfitting.
reg_rforest = RandomForestRegressor(n_estimators = 50,max_depth = 7,min_samples_leaf=5,random_state=1)
reg_rforest_scores = cross_val_score(reg_rforest, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
reg_rforest = reg_rforest.fit(X_train, y_train)
y_predict = reg_rforest.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',reg_rforest.score(X_train,y_train))
print('Testing Score:',reg_rforest.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(reg_rforest_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Random Forest with Pruning'],'Training Score': reg_rforest.score(X_train, y_train).round(3),'Testing score': reg_rforest.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(reg_rforest_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.ensemble import BaggingRegressor
# Bagging with default (decision tree) base learners.
bag = BaggingRegressor(random_state=1)
bag_scores = cross_val_score(bag, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
bag = bag.fit(X_train, y_train)
y_predict = bag.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',bag.score(X_train,y_train))
print('Testing Score:',bag.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(bag_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Bagging'],'Training Score': bag.score(X_train, y_train).round(3),'Testing score': bag.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(bag_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
# Bagging with SVR base learners.
# NOTE(review): `base_estimator` was renamed to `estimator` in sklearn 1.2
# and removed in 1.4 — update the keyword if running a recent sklearn.
bag_svr = BaggingRegressor(base_estimator=SVR(),random_state=1)
bag_svr_scores = cross_val_score(bag_svr, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
bag_svr.fit(X_train, y_train)
# BUG FIX: the prediction used `bag` (the tree-based bagger) instead of
# the SVR-based `bag_svr` model fitted just above.
y_predict = bag_svr.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',bag_svr.score(X_train,y_train))
print('Testing Score:',bag_svr.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(bag_svr_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Bagging with SVR'],'Training Score': bag_svr.score(X_train, y_train).round(3),'Testing score': bag_svr.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(bag_svr_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with default hyperparameters (the boosting-based model).
gboost = GradientBoostingRegressor(random_state=1)
gboost_scores = cross_val_score(gboost, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
gboost = gboost.fit(X_train, y_train)
y_predict = gboost.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',gboost.score(X_train,y_train))
print('Testing Score:',gboost.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(gboost_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Gradient Boost'],'Training Score': gboost.score(X_train, y_train).round(3),'Testing score': gboost.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(gboost_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
import xgboost as xgb
from xgboost import XGBRegressor
# XGBoost with default hyperparameters (second boosting-based model).
xgboost = XGBRegressor(random_state=1)
xgboost_scores = cross_val_score(xgboost, X_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
xgboost = xgboost.fit(X_train, y_train)
y_predict = xgboost.predict(X_test)
# R^2 on train and test sets.
print('Training Score:',xgboost.score(X_train,y_train))
print('Testing Score:',xgboost.score(X_test,y_test))
# Average cross-validated RMSE.
print('RMSE:',np.sqrt(-(xgboost_scores)).mean())
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['XGBoost'],'Training Score': xgboost.score(X_train, y_train).round(3),'Testing score': xgboost.score(X_test, y_test).round(3),
'RMSE': np.sqrt(-(xgboost_scores)).mean().round(3)})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
Random Forest, Gradient Boost and XGBoost Regressors seem to be the best suited models with higher accuracy and lower RMSE scores. Let's tune the models and check for model performance at 95% confidence level.
# Display the hyperparameters of the (untuned) random forest before tuning.
from pprint import pprint
print('Parameters currently in use:\n')
current_params = rforest.get_params()
pprint(current_params)
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
# NOTE(review): max_features='auto' was removed for regressors in sklearn 1.3;
# on recent versions replace it with 1.0 (all features) or 'sqrt'.
n_estimators = [200, 500,1000,1800,2000]
max_features = ['auto','sqrt']
max_depth = [8,10,20,30,40,50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5, 10]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth,
min_samples_split = min_samples_split, max_features = max_features,
min_samples_leaf = min_samples_leaf)
# Create grid search using 5-fold cross validation
clf = GridSearchCV(rforest, param_grid, cv=5,n_jobs = -1,verbose=2)
# Fit grid search
best_model = clf.fit(X_train, y_train)
# Best CV score and the hyperparameters that produced it.
training_score = best_model.best_score_
print(best_model.best_score_)
print(best_model.best_params_)
# Actual model object fit with those best parameters (shows defaults too).
print(best_model.best_estimator_)
# Refit the winning estimator and evaluate on the held-out test set.
clfg = best_model.best_estimator_
print("R Squared:",cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
# Compute the test metrics once and reuse them (the original predicted twice
# per metric).
test_RMSE = np.sqrt(mean_squared_error(y_test, clfg.predict(X_test)))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, clfg.predict(X_test))
print("Test R^2: ", test_score)
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Random Forest with GridSearch CV'],'Training Score': training_score.round(3),'Testing score': test_score.round(3),
'RMSE': test_RMSE})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.model_selection import RandomizedSearchCV
# Same search space as the grid search; RandomizedSearchCV samples n_iter
# combinations instead of exhaustively trying all of them.
n_estimators = [200, 500,1000,1800,2000]
max_features = ['auto','sqrt']
max_depth = [8,10,20,30,40,50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 5, 10]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth,
min_samples_split = min_samples_split, max_features = max_features,
min_samples_leaf = min_samples_leaf)
# Create random search using 5-fold cross validation
clf = RandomizedSearchCV(rforest, param_grid, cv=5, n_iter = 100,n_jobs = -1,verbose=2)
# Fit random search
best_model = clf.fit(X_train, y_train)
print('Best hyper parameter:', best_model.best_params_)
# Best CV score and the hyperparameters that produced it.
training_score = best_model.best_score_
print(best_model.best_score_)
print(best_model.best_params_)
# Actual model object fit with those best parameters (shows defaults too).
print(best_model.best_estimator_)
# Refit the winning estimator and evaluate on the held-out test set.
clfg = best_model.best_estimator_
print("R Squared:",cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
# Compute the test metrics once and reuse them.
test_RMSE = np.sqrt(mean_squared_error(y_test, clfg.predict(X_test)))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, clfg.predict(X_test))
print("Test R^2: ", test_score)
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Random Forest with RandomSearch CV'],'Training Score': training_score.round(3),'Testing score': test_score.round(3),
'RMSE': test_RMSE})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
n_estimators = [200, 300, 400, 500,1000]
learning_rate = [0.001,0.05,0.01,0.1]
max_depth = [8,10,20,30,40]
subsample = [0.5, 0.6, 0.8, 0.9, 1.0]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth,learning_rate = learning_rate,
subsample = subsample)
# Create grid search using 5-fold cross validation
clf = GridSearchCV(gboost, param_grid, cv=5, n_jobs = -1,verbose=2)
# Fit grid search
gboost_best_model = clf.fit(X_train, y_train)
# Best CV score and the hyperparameters that produced it.
training_score = gboost_best_model.best_score_
print(gboost_best_model.best_score_)
print(gboost_best_model.best_params_)
# Actual model object fit with those best parameters (shows defaults too).
print(gboost_best_model.best_estimator_)
# Refit the winning estimator and evaluate on the held-out test set.
clfg = gboost_best_model.best_estimator_
print("R Squared:",cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
# Compute the test metrics once and reuse them.
test_RMSE = np.sqrt(mean_squared_error(y_test, clfg.predict(X_test)))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, clfg.predict(X_test))
print("Test R^2: ", test_score)
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Gradient Boost with GridSearch CV'],'Training Score': training_score.round(3),'Testing score': test_score.round(3),
'RMSE': test_RMSE})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
from sklearn.model_selection import RandomizedSearchCV
# Same search space as the gradient-boosting grid search, sampled randomly.
n_estimators = [200, 300, 400, 500,1000]
learning_rate = [0.001,0.05,0.01,0.1]
max_depth = [8,10,20,30,40]
subsample = [0.5, 0.6, 0.8, 0.9, 1.0]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth,learning_rate = learning_rate,
subsample = subsample)
# Create random search using 5-fold cross validation
clf = RandomizedSearchCV(gboost, param_grid, cv=5, n_jobs = -1,verbose=2)
# Fit random search
gboost_best_model = clf.fit(X_train, y_train)
# Best CV score and the hyperparameters that produced it.
training_score = gboost_best_model.best_score_
print(gboost_best_model.best_score_)
print(gboost_best_model.best_params_)
# Actual model object fit with those best parameters (shows defaults too).
print(gboost_best_model.best_estimator_)
# Refit the winning estimator and evaluate on the held-out test set.
clfg = gboost_best_model.best_estimator_
print("R Squared:",cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
# Compute the test metrics once and reuse them.
test_RMSE = np.sqrt(mean_squared_error(y_test, clfg.predict(X_test)))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, clfg.predict(X_test))
print("Test R^2: ", test_score)
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['Gradient Boost with RandomSearch CV'],'Training Score': training_score.round(3),'Testing score': test_score.round(3),
'RMSE': test_RMSE})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# Search space for XGBoost, kept small to bound the size of the full grid.
n_estimators = [50, 100, 150, 200]
learning_rate = [0.01, 0.1, 0.2, 0.3]
max_depth = range(3, 10)
colsample_bytree = [i/10.0 for i in range(1, 3)]
gamma = [i/10.0 for i in range(3)]
param_grid = dict(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth,
colsample_bytree = colsample_bytree,
gamma = gamma)
# Create grid search using 5-fold cross validation
clf = GridSearchCV(xgboost, param_grid, cv=5, n_jobs = -1,verbose=2)
# Fit grid search
xgboost_best_model = clf.fit(X_train, y_train)
# Best CV score and the hyperparameters that produced it.
training_score = xgboost_best_model.best_score_
print(xgboost_best_model.best_score_)
print(xgboost_best_model.best_params_)
# Actual model object fit with those best parameters (shows defaults too).
print(xgboost_best_model.best_estimator_)
# Refit the winning estimator and evaluate on the held-out test set.
clfg = xgboost_best_model.best_estimator_
print("R Squared:",cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
# Compute the test metrics once and reuse them.
test_RMSE = np.sqrt(mean_squared_error(y_test, clfg.predict(X_test)))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, clfg.predict(X_test))
print("Test R^2: ", test_score)
#Store the accuracy results for each model in a dataframe for final comparison
r_df = pd.DataFrame({'Method':['XGBoost with GridSearch CV'],'Training Score': training_score.round(3),'Testing score': test_score.round(3),
'RMSE': test_RMSE})
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, r_df])
results_df
#Bootstrap Sampling: estimate a 95% confidence interval for the tuned
# Gradient Boosting model's R^2 score.
from sklearn.utils import resample
values = concrete_df_scaled.values
n_iterations = 500 # Number of bootstrap samples to create
n_size = int(len(concrete_df_scaled)) # size of a bootstrap sample
# run bootstrap
stats = list() # holds the R^2 score for each bootstrap iteration
# Hoisted out of the loop: it is the same estimator object every iteration
# (it is simply refit on each bootstrap sample).
gboost_boot = gboost_best_model.best_estimator_
for i in range(n_iterations):
    # Sample rows with replacement; out-of-bag rows form the test set.
    train = resample(values, n_samples=n_size)
    # Perf fix: build a set of row-tuples once per iteration instead of the
    # original O(n^2) `x.tolist() not in train.tolist()` membership scan.
    train_rows = set(map(tuple, train.tolist()))
    test = np.array([x for x in values if tuple(x.tolist()) not in train_rows])
    # Fit against independent variables (all but last column) and the target.
    gboost_boot.fit(train[:,:-1], train[:,-1])
    # Renamed from y_test so the global train/test split is not clobbered.
    y_oob = test[:,-1]
    # Evaluate: R^2 on the out-of-bag rows.
    score = gboost_boot.score(test[:, :-1], y_oob)
    stats.append(score)
# Plot confidence Interval
plt.figure(figsize=(20,5))
sns.histplot(stats, bins=11, kde=True, stat='density')
alpha = 0.95 # for 95% confidence interval
p = ((1.0-alpha)/2.0) * 100 # lower-tail percentile (2.5)
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100 # upper-tail percentile (97.5)
upper = min(1.0, np.percentile(stats, p))
plt.axvline(x=lower, color='r', linestyle=':')
plt.axvline(x=upper, color='r', linestyle=':', label='Confidence Interval Threshold')
plt.legend(loc="best",prop={"size":14})
print("With %.1f confidence interval, Gradient Boosting's score varies between %.1f%% and %.1f%%" %
(alpha*100, lower*100, upper*100))
With 500 bootstrap samples and a 95.0% confidence interval, Gradient Boosting's score varies between 88.4% and 93.1%.
from sklearn.decomposition import PCA
# Work on a copy of the scaled frame (PCA assumes standardized inputs).
concrete_df_pca = concrete_df_scaled.copy()
concrete_df_pca
# Initialize and fit pca
# Create a covariance matrix and calculate eigen values
from sklearn.decomposition import PCA
pca = PCA().fit(concrete_df_pca.drop('strength', axis=1))
# cumulative sum of variance explained with [n] features
eigen_vals = np.round(pca.explained_variance_ratio_, decimals=3)*100
np.cumsum(eigen_vals)
print(pca.explained_variance_)
print(pca.components_)
# calculate variance ratios
var = pca.explained_variance_ratio_;var
# Visualize the contribution of variance of each attribute
plt.figure(figsize=(10,8))
plt.bar(list(range(8)),pca.explained_variance_ratio_,alpha=0.5, align='center');
plt.ylabel('Variation')
plt.xlabel('Number of components')
# 6 components explain 95% of the variance overall
plt.figure(figsize=(10,8))
plt.step(list(range(8)),np.cumsum(pca.explained_variance_ratio_), where='mid');
plt.ylabel('Cumulative of variation')
plt.xlabel('Number of components')
There are 6 components which together explain more than 95% of the variance in the dataset.
# Create a new matrix using the n components
X = concrete_df_pca.drop('strength', axis=1)
# Project the predictors onto the first 6 principal components.
X_proj = PCA(n_components=6).fit_transform(X)
y = concrete_df_pca['strength']
X_proj.shape
# Divide the projected dataset into a 70:30 ratio for train and test
X_proj_train, X_proj_test, y_train, y_test = train_test_split(X_proj, y, test_size=0.3, random_state=1)
# Linear regression on the PCA-projected features.
linreg = LinearRegression()
linreg_scores = cross_val_score(linreg, X_proj_train, y_train, cv=KFold(10, random_state = 1,shuffle = True),scoring='neg_mean_squared_error')
linreg.fit(X_proj_train, y_train)
print(linreg.coef_)
print()
# Calculate the score of Linear Regression (R^2 on train and test)
print('Training score :', linreg.score(X_proj_train, y_train))
print('Testing score :', linreg.score(X_proj_test, y_test))
# Calculate average RMSE
print('RMSE:',np.sqrt(-(linreg_scores)).mean())
# Decision tree on the PCA-projected features.
dtree = DecisionTreeRegressor(random_state=1)
dtree_scores = cross_val_score(dtree, X_proj_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
dtree.fit(X_proj_train, y_train)
#Making the prediction
y_predict = dtree.predict(X_proj_test)
# Calculate the score of Decision Tree Regressor (R^2 on train and test)
print('Training Score:',dtree.score(X_proj_train,y_train))
print('Testing Score:',dtree.score(X_proj_test,y_test))
# calculate the average RMSE
print('RMSE:',np.sqrt(-(dtree_scores)).mean())
# Random forest on the PCA-projected features.
rforest = RandomForestRegressor(random_state=1)
rforest_scores = cross_val_score(rforest, X_proj_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
rforest = rforest.fit(X_proj_train, y_train)
y_predict = rforest.predict(X_proj_test)
# Calculate the score of Random Forest Regressor (R^2 on train and test)
print('Training Score:',rforest.score(X_proj_train,y_train))
print('Testing Score:',rforest.score(X_proj_test,y_test))
# calculate the average RMSE
print('RMSE:',np.sqrt(-(rforest_scores)).mean())
# Gradient boosting on the PCA-projected features.
gboost = GradientBoostingRegressor(random_state=1)
gboost_scores = cross_val_score(gboost, X_proj_train, y_train, cv=KFold(10, random_state = 1,shuffle = True), scoring='neg_mean_squared_error')
gboost = gboost.fit(X_proj_train, y_train)
y_predict = gboost.predict(X_proj_test)
# Calculate the score of Gradient Boost Regressor (R^2 on train and test)
print('Training Score:',gboost.score(X_proj_train,y_train))
print('Testing Score:',gboost.score(X_proj_test,y_test))
# calculate the average RMSE
print('RMSE:',np.sqrt(-(gboost_scores)).mean())
It looks like all the regressors tried above performed very poorly in terms of training and test scores, with high RMSE values on the PCA-projected dataset, and as such are not able to predict the compressive strength of the cement with an acceptable accuracy.
from sklearn.preprocessing import PolynomialFeatures

# Work on a copy so the engineered dataframe stays intact.
concrete_df_poly = concrete_df_new.copy()
concrete_df_poly
# Instantiate a polynomial-features object of degree 2; interaction_only=True
# generates only products of distinct features (no squared columns).
poly = PolynomialFeatures(degree=2, interaction_only=True)
# Fit and transform the input features (target excluded).
concrete_df_poly = poly.fit_transform(concrete_df_poly.drop('strength', axis=1))
# Get the shape of the newly generated features
print("Number of rows :",concrete_df_poly.shape[0])
print("Number of columns :",concrete_df_poly.shape[1])
# Rebuild a DataFrame with feature names.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() and fall back for older sklearn versions.
try:
    poly_feature_names = poly.get_feature_names_out()
except AttributeError:
    poly_feature_names = poly.get_feature_names()
concrete_df_poly = pd.DataFrame(concrete_df_poly, columns=poly_feature_names)
# Join the strength column to create a polynomial dataset
concrete_df_poly['strength'] = concrete_df_new['strength']
concrete_df_poly.shape
# Visualize the correlation matrix
plt.figure(figsize=(15,10))
sns.heatmap(concrete_df_poly.corr())
# Keep only the features whose correlation with strength is strong
# (>= 0.6 or <= -0.5).
x = concrete_df_poly.corr()['strength']
upper_limit = 0.6
lower_limit = -0.5
fltr = [(v >= upper_limit) or (v <= lower_limit) for v in x]
x[fltr].sort_values()
This degree-2 polynomial-features dataset shows very little correlation among the features and with the strength column, indicating that none of the newly generated features is a strong predictor of compressive strength. All have a coefficient less than or close to 0.5.
# Import package
import featuretools as ft
# Inspect the last 30 feature primitives featuretools offers.
ft.list_primitives().tail(30)
# initialize entityset
# NOTE(review): this uses the featuretools 0.x API; entity_from_dataframe was
# removed in featuretools 1.0 (replaced by EntitySet.add_dataframe) -- confirm
# the pinned featuretools version before upgrading.
es = ft.EntitySet(id = 'concrete_entitySet')
concrete_df_new
# initialize entity
# make_index=True adds a surrogate 'id' column to serve as the entity index
# (the 'id' column is dropped again downstream before modelling).
es.entity_from_dataframe(entity_id = 'concrete_entity_id', index = 'id',dataframe = concrete_df_new, make_index = True)
es['concrete_entity_id'].df
es['concrete_entity_id']
# Deep Feature Synthesis: generate composite features via scalar addition,
# pairwise multiplication and pairwise division of the numeric columns.
concrete_new_features, feature_defs = ft.dfs(entityset = es, target_entity = 'concrete_entity_id',
trans_primitives = ['add_numeric_scalar', 'multiply_numeric','divide_numeric'])
concrete_new_features.head()
# Visualize the correlation matrix
plt.figure(figsize=(15,10))
sns.heatmap(concrete_new_features.corr())
# Correlation matrix
corr = concrete_new_features.corr()
corr
concrete_new_features
# Keep only the synthesized features whose correlation with strength is
# >= 0.6 or <= -0.5.
x = concrete_new_features.corr()['strength']
upper_limit = 0.6
lower_limit = -0.5
fltr = list(map(lambda x: True if(x >= upper_limit or x <= lower_limit) else False, x))
x[fltr].sort_values()
Extracting the composite features identified above that correlate well with strength, and adding them to the dataset.
# Columns to carry forward: every original feature (minus the surrogate
# 'id' index added by featuretools) plus the three synthesized ratio
# features that correlate with strength.
cols = list(concrete_df_new.columns)
cols.remove('id')
cols
cols += ['water / age', 'fineagg / age', 'coarseagg / age']
print(cols)
concrete_ft_eng = concrete_new_features[cols]
concrete_ft_eng
# Standardize all columns (target included) to zero mean / unit variance.
std_scale = StandardScaler()
concrete_ft_scaled = pd.DataFrame(
    std_scale.fit_transform(concrete_ft_eng),
    columns=concrete_ft_eng.columns,
)
# Predictors vs. target (kept as a one-column DataFrame).
X = concrete_ft_scaled.drop(['strength'], axis=1)
Y = concrete_ft_scaled[['strength']]
# Split into training and test set (70/30, fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
def _report_engineered(model):
    """Cross-validate `model` on the engineered training features, fit it,
    and print train/test R^2 plus the mean 10-fold CV RMSE.

    Returns the fitted model, the per-fold neg-MSE scores and the
    test-set predictions.
    """
    fold_scores = cross_val_score(
        model, X_train, y_train,
        cv=KFold(10, random_state=1, shuffle=True),
        scoring='neg_mean_squared_error')
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('Training Score:', model.score(X_train, y_train))
    print('Testing Score:', model.score(X_test, y_test))
    print('RMSE:', np.sqrt(-fold_scores).mean())
    return model, fold_scores, preds

# Bagging-based model: random forest.
rforest, rforest_scores, y_predict = _report_engineered(RandomForestRegressor(random_state=1))
# Boosting-based model: gradient boosting.
gboost, gboost_scores, y_predict = _report_engineered(GradientBoostingRegressor(random_state=1))
The random forest model appears to be highly overfit, as there is quite a drop in accuracy on the test set. GradientBoostingRegressor performs better than RandomForestRegressor, and hence it can be tuned further to squeeze out extra performance.
# Hyperparameter grid for GradientBoostingRegressor.
n_estimators = [200, 300, 400, 500, 1000]
learning_rate = [0.001, 0.05, 0.01, 0.1]
max_depth = [8, 10, 20, 30, 40]
subsample = [0.5, 0.6, 0.8, 0.9, 1.0]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,
                  learning_rate=learning_rate, subsample=subsample)
# Create grid search using 5-fold cross validation (default scoring: R^2).
clf = GridSearchCV(gboost, param_grid, cv=5, n_jobs=-1, verbose=2)
# Fit grid search. ravel() turns the one-column DataFrame target into a
# 1-D array, avoiding sklearn's DataConversionWarning.
gboost_best_model = clf.fit(X_train, y_train.values.ravel())
# View best hyperparameters
training_score = gboost_best_model.best_score_
print(gboost_best_model.best_score_)
print(gboost_best_model.best_params_)
# Actual model object fit with those best parameters
# (also shows default parameters that we did not specify).
print(gboost_best_model.best_estimator_)
# Refit the best estimator and evaluate. Test-set predictions are computed
# once and reused instead of calling predict(X_test) twice as before.
clfg = gboost_best_model.best_estimator_
print("R Squared:", cross_val_score(clfg, X_train, y_train.values.ravel(), cv=10, scoring='r2').mean())
clfg = clfg.fit(X_train, y_train.values.ravel())
y_test_pred = clfg.predict(X_test)
test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("Test RMSE: ", test_RMSE)
test_score = r2_score(y_test, y_test_pred)
print("Test R^2: ", test_score)
#Bootstrap Sampling: estimate a confidence interval for the tuned
# gradient-boosting model's R^2 score.
from sklearn.utils import resample
from sklearn.base import clone

values = concrete_ft_scaled.values
n_iterations = 500                     # Number of bootstrap samples to create
n_size = int(len(concrete_ft_scaled))  # size of a bootstrap sample
# run bootstrap
stats = list()  # holds the out-of-bag R^2 score for each bootstrap iteration
for i in range(n_iterations):
    # prepare train and test sets
    train = resample(values, n_samples=n_size)  # Sampling with replacement
    # Out-of-bag rows: everything not drawn into the bootstrap sample.
    # A set of row tuples makes this O(n) instead of the original O(n^2)
    # list-membership scan, with identical membership semantics.
    train_rows = set(map(tuple, train.tolist()))
    test = np.array([row for row in values if tuple(row) not in train_rows])
    # Fit a fresh clone each iteration so the tuned best_estimator_ (which
    # clfg aliases) is not refit in place and clobbered by this loop.
    gboost_boot = clone(gboost_best_model.best_estimator_)
    gboost_boot.fit(train[:, :-1], train[:, -1])
    # Evaluate on the out-of-bag rows. A dedicated name is used so the
    # y_test split from train_test_split is not overwritten (the original
    # code clobbered y_test here).
    y_oob = test[:, -1]
    predictions = gboost_boot.predict(test[:, :-1])
    score = r2_score(y_oob, predictions)
    stats.append(score)
# Plot confidence Interval
plt.figure(figsize=(20,5))
# plt.hist(stats)
sns.distplot(stats, bins=11)  # NOTE(review): distplot is deprecated in seaborn >= 0.11
alpha = 0.95  # for 95% confidence interval
p = ((1.0-alpha)/2.0) * 100  # lower-tail percentile (2.5)
lower = max(0.0, np.percentile(stats, p))
p = (alpha+((1.0-alpha)/2.0)) * 100  # upper-tail percentile (97.5)
upper = min(1.0, np.percentile(stats, p))
plt.axvline(x=lower, color='r', linestyle=':')
plt.axvline(x=upper, color='r', linestyle=':', label='Confidence Interval Threshold')
plt.legend(loc="best",prop={"size":14})
print("With %.1f confidence interval, Gradient Boosting's score varies between %.1f%% and %.1f%%" %
(alpha*100, lower*100, upper*100))